Purpose:
Runs survival analysis models using splicing cluster assignment and 1) single exon splicing burden index (SBI), 2) KEGG Spliceosome GSVA scores, 3) CLK1 exon 4 TPM, or 4) CLK1 exon 4 PSI as a predictor.
Uses wrapper functions (e.g. survival_analysis) sourced from util/survival_models.R
in the survival analysis folder.
Load packages, set directory paths and call setup script
library(tidyverse)
library(survival)
library(ggpubr)
library(ggplot2)
library(patchwork)
# Locate the repository root (the directory that contains `.git`) and derive
# the data, analysis, input, results, and plot directories from it.
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data")
analysis_dir <- file.path(root_dir, "analyses", "survival")
input_dir <- file.path(analysis_dir, "results")
results_dir <- file.path(analysis_dir, "results")
plot_dir <- file.path(analysis_dir, "plots")

# Create the output directories if they do not exist yet. `input_dir` is the
# same path as `results_dir`, so it is covered as well. `plot_dir` must exist
# because every ggsave() call below writes into it.
for (out_dir in c(results_dir, plot_dir)) {
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
}

# Source the project's survival helper script.
# NOTE(review): presumably this defines fit_save_model(), plotForest(),
# survival_analysis() and plotKM() used below -- confirm against the script.
source(file.path(analysis_dir, "util", "survival_models.R"))

# Disable knitr chunk caching so every chunk is re-run on render
knitr::opts_chunk$set(cache = FALSE)
Set metadata and cluster assignment file paths
# Sample metadata table with splicing indices and survival columns
metadata_file <- file.path(input_dir, "splicing_indices_with_survival.tsv")

# Cluster assignment per sample, from the sample-psi-clustering module
cluster_file <- file.path(
  root_dir, "analyses", "sample-psi-clustering", "results",
  "sample-cluster-metadata-top-5000-events-stranded.tsv"
)

# GSVA scores per sample and gene set, from the sample-psi-clustering module
kegg_scores_stranded_file <- file.path(
  root_dir, "analyses", "sample-psi-clustering", "results",
  "gsva_output_stranded.tsv"
)

# Transcript-level (isoform) RSEM TPM expression matrix
tpm_file <- file.path(data_dir, "rna-isoform-expression-rsem-tpm.rds")

# CLK1 exon 4 PSI values, from the CLK1-splicing_correlations module
clk1_psi_file <- file.path(
  root_dir, "analyses", "CLK1-splicing_correlations", "results",
  "clk1-exon4-psi.tsv"
)
Wrangle data
Add cluster assignment, CLK1 exon 4 TPM and PSI, and spliceosome GSVA scores to
metadata, and define column lgg_group (LGG or non-LGG)
# Sample-level metadata with splicing indices and survival columns
metadata <- read_tsv(metadata_file)

# Cluster assignments; rename the sample column so it matches the
# biospecimen ID column used as the join key below
clusters <- read_tsv(cluster_file) %>%
  dplyr::rename(Kids_First_Biospecimen_ID = sample_id)

# CLK1 exon 4 PSI; `plot_group` is dropped here so it does not take part in
# (or get duplicated by) the later joins
clk1_psi <- read_tsv(clk1_psi_file) %>%
  dplyr::rename(CLK1_ex4_PSI = PSI) %>%
  dplyr::select(-plot_group)

# Keep only the KEGG spliceosome gene set scores
gsva_scores <- read_tsv(kegg_scores_stranded_file) %>%
  dplyr::filter(geneset == "KEGG_SPLICEOSOME") %>%
  dplyr::rename(spliceosome_gsva_score = score)
# Per-sample summed TPM of the CLK1 transcripts labelled "Exon 4", joined to
# the cluster assignments and the CLK1 exon 4 PSI values.
# NOTE(review): the three transcript IDs are presumably the CLK1 isoforms that
# include exon 4 -- confirm against the annotation.
all_clk4_transcr_counts <- readRDS(tpm_file) %>%
  # keep rows whose gene symbol starts with "CLK1"
  filter(grepl("^CLK1", gene_symbol)) %>%
  mutate(
    transcript_id = case_when(
      transcript_id %in% c(
        "ENST00000321356.9",
        "ENST00000434813.3",
        "ENST00000409403.6"
      ) ~ "Exon 4",
      # transcript_id == "ENST00000321356.9" ~ "Exon 4",
      TRUE ~ "Other"
    )
  ) %>%
  group_by(transcript_id) %>%
  # Sum TPM over transcripts within each label for every sample column
  # (columns starting with "BS"). The function is wrapped explicitly because
  # passing extra arguments through across() is deprecated in dplyr >= 1.1.
  summarise(across(starts_with("BS"), function(x) sum(x, na.rm = TRUE))) %>%
  pivot_longer(
    cols = -transcript_id,
    names_to = "Kids_First_Biospecimen_ID",
    values_to = "CLK1_Ex4_TPM"
  ) %>%
  filter(transcript_id == "Exon 4") %>%
  # only samples with a cluster assignment are retained
  inner_join(clusters, by = "Kids_First_Biospecimen_ID") %>%
  # NOTE(review): natural join on all shared column names; the key is not
  # spelled out because the columns of `clk1_psi` are not visible here
  left_join(clk1_psi)
# how many clusters?
# (only referenced by the commented-out releveling alternative below)
n_clust <- length(unique(clusters$cluster))

# Restrict metadata to samples with CLK1 exon 4 TPM / cluster information
# (right join), attach spliceosome GSVA scores, and derive model covariates.
metadata <- metadata %>%
  # NOTE(review): natural join on all shared column names -- confirm that
  # Kids_First_Biospecimen_ID is the only intended key
  right_join(
    all_clk4_transcr_counts %>%
      dplyr::select(
        Kids_First_Biospecimen_ID,
        cluster, CLK1_Ex4_TPM, CLK1_ex4_PSI
      )
  ) %>%
  left_join(
    gsva_scores %>%
      dplyr::select(sample_id, spliceosome_gsva_score),
    by = c("Kids_First_Biospecimen_ID" = "sample_id")
  ) %>%
  dplyr::mutate(
    cluster = glue::glue("Cluster {cluster}"),
    # alternative ordering: fct_relevel(cluster, paste0("Cluster ", 1:n_clust))
    # Put "Cluster 6" first so it becomes the first factor level
    cluster = forcats::fct_relevel(cluster, "Cluster 6", after = 0),
    # Everything that is not low-grade glioma is grouped as non-LGG
    lgg_group = case_when(
      plot_group == "Low-grade glioma" ~ "LGG",
      TRUE ~ "non-LGG"
    ),
    SBI = SI_Total * 10,
    age_at_diagnosis_years = age_at_diagnosis_days / 365.25
  )
Generate coxph models (OS and EFS) including extent of tumor resection, lgg group, cluster assignment, age at diagnosis, SBI, and CLK1 exon 4 TPM as covariates
# Additive Cox models (OS, then EFS) with SBI and CLK1 exon 4 TPM.
# Samples whose extent of tumor resection is "Not Reported" or "Unavailable"
# are excluded before fitting.
resection_known <- metadata[
  !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
]
add_terms <- "extent_of_tumor_resection+lgg_group+cluster+age_at_diagnosis_years+SBI+CLK1_Ex4_TPM"

# Overall survival: fit, reload the saved model, draw and save the forest plot
os_rds <- file.path(
  results_dir,
  "cox_OS_additive_terms_resection_lgg_group_cluster_SBI_CLK1_Ex4_TPM.RDS"
)
add_model_os <- fit_save_model(resection_known,
  terms = add_terms,
  os_rds,
  "multivariate",
  years_col = "OS_years",
  status_col = "OS_status"
)
forest_os <- plotForest(readRDS(os_rds))
forest_os
ggsave(
  file.path(
    plot_dir,
    "forest_add_OS_resection_lgg_group_cluster_assignment_SBI_CLK1_Ex4_TPM.pdf"
  ),
  forest_os,
  width = 10, height = 6, units = "in",
  device = "pdf"
)

# Event-free survival: same covariates
efs_rds <- file.path(
  results_dir,
  "cox_EFS_additive_terms_resection_lgg_group_cluster_SBI_CLK1_Ex4_TPM.RDS"
)
add_model_efs <- fit_save_model(resection_known,
  terms = add_terms,
  efs_rds,
  "multivariate",
  years_col = "EFS_years",
  status_col = "EFS_status"
)
forest_efs <- plotForest(readRDS(efs_rds))
forest_efs
ggsave(
  file.path(
    plot_dir,
    "forest_add_EFS_resection_lgg_group_cluster_assignment_SBI_CLK1_Ex4_TPM.pdf"
  ),
  forest_efs,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
Repeat analysis with CLK1 exon 4 TPM alone (without SBI)
# Additive Cox models (OS, then EFS) with CLK1 exon 4 TPM but without SBI.
# Samples whose extent of tumor resection is "Not Reported" or "Unavailable"
# are excluded before fitting.
resection_known <- metadata[
  !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
]
add_terms <- "extent_of_tumor_resection+lgg_group+cluster+age_at_diagnosis_years+CLK1_Ex4_TPM"

# Overall survival: fit, reload the saved model, draw and save the forest plot
os_rds <- file.path(
  results_dir,
  "cox_OS_additive_terms_resection_lgg_group_cluster_CLK1_Ex4_TPM.RDS"
)
add_model_os <- fit_save_model(resection_known,
  terms = add_terms,
  os_rds,
  "multivariate",
  years_col = "OS_years",
  status_col = "OS_status"
)
forest_os <- plotForest(readRDS(os_rds))
forest_os
ggsave(
  file.path(
    plot_dir,
    "forest_add_OS_resection_lgg_group_cluster_assignment_CLK1_Ex4_TPM.pdf"
  ),
  forest_os,
  width = 10, height = 6, units = "in",
  device = "pdf"
)

# Event-free survival: same covariates
efs_rds <- file.path(
  results_dir,
  "cox_EFS_additive_terms_resection_lgg_group_cluster_CLK1_Ex4_TPM.RDS"
)
add_model_efs <- fit_save_model(resection_known,
  terms = add_terms,
  efs_rds,
  "multivariate",
  years_col = "EFS_years",
  status_col = "EFS_status"
)
forest_efs <- plotForest(readRDS(efs_rds))
forest_efs
ggsave(
  file.path(
    plot_dir,
    "forest_add_EFS_resection_lgg_group_cluster_assignment_CLK1_Ex4_TPM.pdf"
  ),
  forest_efs,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
repeat analysis with CLK1 exon 4 PSI
# Additive Cox models (OS, then EFS) with CLK1 exon 4 PSI as the predictor.
# Samples whose extent of tumor resection is "Not Reported" or "Unavailable"
# are excluded before fitting.
resection_known <- metadata[
  !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
]
add_terms <- "extent_of_tumor_resection+lgg_group+cluster+age_at_diagnosis_years+CLK1_ex4_PSI"

# Overall survival: fit, reload the saved model, draw and save the forest plot
os_rds <- file.path(
  results_dir,
  "cox_OS_additive_terms_resection_lgg_group_cluster_CLK1_ex4_PSI.RDS"
)
add_model_os <- fit_save_model(resection_known,
  terms = add_terms,
  os_rds,
  "multivariate",
  years_col = "OS_years",
  status_col = "OS_status"
)
forest_os <- plotForest(readRDS(os_rds))
forest_os
ggsave(
  file.path(
    plot_dir,
    "forest_add_OS_resection_lgg_group_cluster_assignment_CLK1_ex4_PSI.pdf"
  ),
  forest_os,
  width = 10, height = 6, units = "in",
  device = "pdf"
)

# Event-free survival: same covariates
efs_rds <- file.path(
  results_dir,
  "cox_EFS_additive_terms_resection_lgg_group_cluster_CLK1_ex4_PSI.RDS"
)
add_model_efs <- fit_save_model(resection_known,
  terms = add_terms,
  efs_rds,
  "multivariate",
  years_col = "EFS_years",
  status_col = "EFS_status"
)
forest_efs <- plotForest(readRDS(efs_rds))
forest_efs
ggsave(
  file.path(
    plot_dir,
    "forest_add_EFS_resection_lgg_group_cluster_assignment_CLK1_ex4_PSI.pdf"
  ),
  forest_efs,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
Interaction with GSVA, SBI, CLK1
# Predictors that are each tested for an interaction with cluster assignment
models <- c("spliceosome_gsva_score", "SBI", "CLK1_Ex4_TPM", "CLK1_ex4_PSI")

# by cluster
# For every predictor, fit EFS and OS Cox models with a cluster x predictor
# interaction, then draw and save the forest plot of each saved model.
for (each in models) {
  # Exclude samples without a usable extent of tumor resection
  resection_known <- metadata[
    !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
  ]
  int_terms <- paste0(
    "extent_of_tumor_resection+lgg_group+cluster*", each,
    "+age_at_diagnosis_years"
  )

  # Event-free survival
  efs_rds <- file.path(
    results_dir,
    paste0("cox_EFS_interaction_terms_resection_lgg_group_cluster_", each, ".RDS")
  )
  int_model_efs <- fit_save_model(resection_known,
    terms = int_terms,
    efs_rds,
    "multivariate",
    years_col = "EFS_years",
    status_col = "EFS_status"
  )
  int_forest_efs <- plotForest(readRDS(efs_rds))
  # Values are not auto-printed inside a for loop, so print explicitly
  print(int_forest_efs)
  ggsave(
    file.path(
      plot_dir,
      paste0("forest_int_EFS_resection_lgg_group_cluster_assignment_", each, ".pdf")
    ),
    int_forest_efs,
    width = 10, height = 6, units = "in",
    device = "pdf"
  )

  # Overall survival
  os_rds <- file.path(
    results_dir,
    paste0("cox_OS_interaction_terms_resection_lgg_group_cluster_", each, ".RDS")
  )
  int_model_os <- fit_save_model(resection_known,
    terms = int_terms,
    os_rds,
    "multivariate",
    years_col = "OS_years",
    status_col = "OS_status"
  )
  int_forest_os <- plotForest(readRDS(os_rds))
  print(int_forest_os)
  ggsave(
    file.path(
      plot_dir,
      paste0("forest_int_OS_resection_lgg_group_cluster_assignment_", each, ".pdf")
    ),
    int_forest_os,
    width = 10, height = 6, units = "in",
    device = "pdf"
  )
}
## clk1 x age
# Cox models (EFS, then OS) with a CLK1 exon 4 TPM x age-at-diagnosis
# interaction. Samples whose extent of tumor resection is "Not Reported" or
# "Unavailable" are left out of both fits.
clk1_age_data <- metadata[
  !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
]
clk1_age_terms <- "extent_of_tumor_resection+lgg_group+cluster+CLK1_Ex4_TPM*age_at_diagnosis_years"

# Event-free survival
clk1_age_efs_rds <- file.path(
  results_dir,
  "cox_EFS_interaction_terms_resection_lgg_group_cluster_CLK1_Ex4_TPM_age.RDS"
)
int_model_efs <- fit_save_model(clk1_age_data,
  terms = clk1_age_terms,
  clk1_age_efs_rds,
  "multivariate",
  years_col = "EFS_years",
  status_col = "EFS_status"
)
int_forest_efs <- plotForest(readRDS(clk1_age_efs_rds))
int_forest_efs
ggsave(
  file.path(plot_dir, "forest_int_EFS_resection_lgg_group_cluster_clk1_age.pdf"),
  int_forest_efs,
  width = 10, height = 6, units = "in",
  device = "pdf"
)

# Overall survival
# NOTE(review): the OS model file uses the "clk1_age" suffix while the EFS
# model file uses "CLK1_Ex4_TPM_age" -- confirm whether that is intended.
clk1_age_os_rds <- file.path(
  results_dir,
  "cox_OS_interaction_terms_resection_lgg_group_cluster_clk1_age.RDS"
)
int_model_os <- fit_save_model(clk1_age_data,
  terms = clk1_age_terms,
  clk1_age_os_rds,
  "multivariate",
  years_col = "OS_years",
  status_col = "OS_status"
)
int_forest_os <- plotForest(readRDS(clk1_age_os_rds))
int_forest_os
ggsave(
  file.path(plot_dir, "forest_int_OS_resection_lgg_group_cluster_clk1_age.pdf"),
  int_forest_os,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
# Predictors that are each tested for an interaction with the spliceosome
# GSVA score (EFS only)
models2 <- c("SBI", "CLK1_Ex4_TPM", "CLK1_ex4_PSI")

for (each in models2) {
  #### by spliceosome_gsva_score
  # Exclude samples without a usable extent of tumor resection
  resection_known <- metadata[
    !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
  ]
  # Shared stem of the model (.RDS) and plot (.pdf) file names
  # NOTE(review): the plot keeps the "cox_" prefix rather than the
  # "forest_int_" prefix used for other plots -- confirm this is intended.
  out_stem <- paste0(
    "cox_EFS_interaction_terms_resection_lgg_group_cluster_spliceosome_gsva_score_",
    each
  )
  efs_rds <- file.path(results_dir, paste0(out_stem, ".RDS"))

  int_model_efs <- fit_save_model(resection_known,
    terms = paste0(
      "extent_of_tumor_resection+lgg_group+cluster+spliceosome_gsva_score*",
      each, "+age_at_diagnosis_years"
    ),
    efs_rds,
    "multivariate",
    years_col = "EFS_years",
    status_col = "EFS_status"
  )
  int_forest_efs <- plotForest(readRDS(efs_rds))
  # Values are not auto-printed inside a for loop, so print explicitly
  print(int_forest_efs)
  ggsave(
    file.path(plot_dir, paste0(out_stem, ".pdf")),
    int_forest_efs,
    width = 10, height = 6, units = "in",
    device = "pdf"
  )
}
# Additive EFS Cox model with the spliceosome GSVA score and CLK1 exon 4 TPM.
# Samples whose extent of tumor resection is "Not Reported" or "Unavailable"
# are left out of the fit.
efs_spliceosome_data <- metadata[
  !metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"),
]
efs_spliceosome_rds <- file.path(
  results_dir,
  "cox_EFS_additive_terms_resection_lgg_group_cluster_spliceosome_score_CLK1_Ex4_TPM.RDS"
)

add_model_efs <- fit_save_model(efs_spliceosome_data,
  terms = "extent_of_tumor_resection+lgg_group+cluster+age_at_diagnosis_years+spliceosome_gsva_score+CLK1_Ex4_TPM",
  efs_spliceosome_rds,
  "multivariate",
  years_col = "EFS_years",
  status_col = "EFS_status"
)

# Reload the saved model, then draw and save its forest plot
forest_efs <- plotForest(readRDS(efs_spliceosome_rds))
forest_efs
ggsave(
  file.path(
    plot_dir,
    "forest_add_EFS_resection_lgg_group_cluster_assignment_spliceosome_score_CLK1_Ex4_TPM.pdf"
  ),
  forest_efs,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
# Additive OS Cox model with the spliceosome GSVA score and CLK1 exon 4 TPM.
# Samples whose extent of tumor resection is "Not Reported" or "Unavailable"
# are excluded before fitting.
os_spliceosome_rds <- file.path(
  results_dir,
  "cox_OS_additive_terms_resection_lgg_group_cluster_spliceosome_score_CLK1_Ex4_TPM.RDS"
)
add_model_os <- fit_save_model(
  metadata[!metadata$extent_of_tumor_resection %in% c("Not Reported", "Unavailable"), ],
  terms = "extent_of_tumor_resection+lgg_group+cluster+age_at_diagnosis_years+spliceosome_gsva_score+CLK1_Ex4_TPM",
  os_spliceosome_rds,
  "multivariate",
  # The OS model must be fitted on the OS time/status columns; the EFS
  # columns were previously used here by copy-paste.
  years_col = "OS_years",
  status_col = "OS_status"
)

# Reload the saved model, then draw and save its forest plot
forest_os <- plotForest(readRDS(os_spliceosome_rds))
forest_os
ggsave(
  file.path(
    plot_dir,
    "forest_add_OS_resection_lgg_group_cluster_assignment_spliceosome_score_CLK1_Ex4_TPM.pdf"
  ),
  # save the OS forest plot (previously the EFS plot was written to this file)
  forest_os,
  width = 10, height = 6, units = "in",
  device = "pdf"
)
Filter for clusters
cluster_list <- unique(metadata$cluster)
for (each in cluster_list) {
cluster_df <- metadata %>%
dplyr::filter(cluster == each,
!is.na(EFS_days)) %>%
dplyr::mutate(CLK1_TPM_group = case_when(
CLK1_Ex4_TPM > summary(CLK1_Ex4_TPM)[\3rd Qu.\] ~ \High CLK1 TPM\,
CLK1_Ex4_TPM < summary(CLK1_Ex4_TPM)[\1st Qu.\] ~ \Low CLK1 TPM\,
TRUE ~ NA_character_),
CLK1_PSI_group = case_when(CLK1_ex4_PSI > summary(CLK1_ex4_PSI)[\3rd Qu.\] ~ \High CLK1 PSI\,
CLK1_ex4_PSI < summary(CLK1_ex4_PSI)[\1st Qu.\] ~ \Low CLK1 PSI\,
TRUE ~ NA_character_
)) %>%
dplyr::mutate(CLK1_TPM_group = fct_relevel(CLK1_TPM_group,
c(\Low CLK1 TPM\, \High CLK1 TPM\)),
CLK1_PSI_group = fct_relevel(CLK1_PSI_group,
c(\Low CLK1 PSI\, \High CLK1 PSI\))) %>%
# collapse groups which do not have min N
dplyr::mutate(
plot_group = forcats::fct_drop(plot_group),
plot_group = forcats::fct_lump_min(plot_group, min = 3, other_level = \Collapsed\)
)
safe_each <- gsub(\[^A-Za-z0-9_-]+\, \_\, each)
# Generate KM models with `CLK1_TPM_group` as covariate
# Generate kaplan meier survival models for OS and EFS, and save outputs
cluster_clk_tpm_kap_os <- survival_analysis(
metadata = cluster_df %>%
dplyr::filter(!is.na(CLK1_TPM_group)),
ind_var = \CLK1_TPM_group\,
test = \kap.meier\,
metadata_sample_col = \Kids_First_Biospecimen_ID\,
days_col = \OS_days\,
status_col = \OS_status\
)
readr::write_rds(cluster_clk_tpm_kap_os,
file.path(results_dir, paste0( \logrank_\, safe_each, \_OS_clk1_tpm_group.RDS\)))
cluster_clk_tpm_kap_efs <- survival_analysis(
metadata = cluster_df %>%
dplyr::filter(!is.na(CLK1_TPM_group)),
ind_var = \CLK1_TPM_group\,
test = \kap.meier\,
metadata_sample_col = \Kids_First_Biospecimen_ID\,
days_col = \EFS_days\,
status_col = \EFS_status\
)
readr::write_rds(cluster_clk_tpm_kap_efs,
file.path(results_dir, paste0( \logrank_\, safe_each, \_EFS_clk1_tpm_group.RDS\)))
# Generate KM models with `CLK1_PSI_group` as covariate
# Generate kaplan meier survival models for OS and EFS, and save outputs
cluster_clk_psi_kap_os <- survival_analysis(
metadata = cluster_df %>%
dplyr::filter(!is.na(CLK1_PSI_group)),
ind_var = \CLK1_PSI_group\,
test = \kap.meier\,
metadata_sample_col = \Kids_First_Biospecimen_ID\,
days_col = \OS_days\,
status_col = \OS_status\
)
readr::write_rds(cluster_clk_psi_kap_os,
file.path(results_dir, paste0( \logrank_\, safe_each, \_OS_clk1_psi_group.RDS\)))
cluster_clk_psi_kap_efs <- survival_analysis(
metadata = cluster_df %>%
dplyr::filter(!is.na(CLK1_PSI_group)),
ind_var = \CLK1_PSI_group\,
test = \kap.meier\,
metadata_sample_col = \Kids_First_Biospecimen_ID\,
days_col = \EFS_days\,
status_col = \EFS_status\
)
readr::write_rds(cluster_clk_psi_kap_efs,
file.path(results_dir, paste0( \logrank_\, safe_each, \_EFS_clk1_psi_group.RDS\)))
# Generate cluster KM SI_group plots
km_cluster_clk_tpm_os_plot <- plotKM(model = cluster_clk_tpm_kap_os,
variable = \CLK1_TPM_group\,
combined = F,
title = paste0(each, \
sessionInfo()
R version 4.4.0 (2024-04-24)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 22.04.4 LTS
Matrix products: default
BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: Etc/UTC
tzcode source: system (glibc)
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] gtools_3.9.5 survminer_0.4.9 patchwork_1.2.0 ggpubr_0.6.0
[5] survival_3.7-0 lubridate_1.9.4 forcats_1.0.1 stringr_1.6.0
[9] dplyr_1.1.4 purrr_1.2.0 readr_2.1.6 tidyr_1.3.1
[13] tibble_3.3.0 ggplot2_4.0.1 tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] gtable_0.3.6 xfun_0.54 bslib_0.9.0 rstatix_0.7.2
[5] lattice_0.22-7 tzdb_0.5.0 vctrs_0.6.5 tools_4.4.0
[9] generics_0.1.4 parallel_4.4.0 pkgconfig_2.0.3 Matrix_1.7-4
[13] data.table_1.17.8 RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.4
[17] compiler_4.4.0 farver_2.1.2 textshaping_1.0.4 carData_3.0-5
[21] colorblindr_0.1.0 htmltools_0.5.8.1 sass_0.4.10 yaml_2.3.10
[25] crayon_1.5.3 pillar_1.11.1 car_3.1-2 jquerylib_0.1.4
[29] cachem_1.1.0 abind_1.4-5 km.ci_0.5-6 commonmark_2.0.0
[33] tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7 labeling_0.4.3
[37] splines_4.4.0 cowplot_1.1.3 rprojroot_2.1.1 fastmap_1.2.0
[41] grid_4.4.0 colorspace_2.1-2 cli_3.6.5 magrittr_2.0.4
[45] broom_1.0.10 withr_3.0.2 scales_1.4.0 backports_1.5.0
[49] bit64_4.6.0-1 timechange_0.3.0 rmarkdown_2.30 ggtext_0.1.2
[53] bit_4.6.0 gridExtra_2.3 ggsignif_0.6.4 ragg_1.5.0
[57] zoo_1.8-12 hms_1.1.4 evaluate_1.0.5 knitr_1.50
[61] KMsurv_0.1-5 markdown_1.13 survMisc_0.5.6 rlang_1.1.6
[65] Rcpp_1.1.0 gridtext_0.1.5 xtable_1.8-4 glue_1.8.0
[69] xml2_1.5.0 vroom_1.6.6 jsonlite_2.0.0 R6_2.6.1
[73] systemfonts_1.3.1